Linear model to predict wage using age
library(ISLR)
attach(Wage)
fit=lm(wage~poly(age,degree=4,raw=T),data=Wage)
coef(summary(fit))
## Estimate Std. Error t value
## (Intercept) -1.841542e+02 6.004038e+01 -3.067172
## poly(age, degree = 4, raw = T)1 2.124552e+01 5.886748e+00 3.609042
## poly(age, degree = 4, raw = T)2 -5.638593e-01 2.061083e-01 -2.735743
## poly(age, degree = 4, raw = T)3 6.810688e-03 3.065931e-03 2.221409
## poly(age, degree = 4, raw = T)4 -3.203830e-05 1.641359e-05 -1.951938
## Pr(>|t|)
## (Intercept) 0.0021802539
## poly(age, degree = 4, raw = T)1 0.0003123618
## poly(age, degree = 4, raw = T)2 0.0062606446
## poly(age, degree = 4, raw = T)3 0.0263977518
## poly(age, degree = 4, raw = T)4 0.0510386498
Predict and boundaries
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=T)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
Plot
plot(age,wage,xlim=agelims,col="darkgrey")
title("Degree -4 Polynomial", out=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
Use ANOVA to determine the degree of the poly to use.
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Logistic Regression to predict whether wage>250 using age
fit=glm(I(wage>250)~poly(age,4),data=Wage,family=binomial)
preds=predict(fit,newdata=list(age=age.grid),se=T)
pfit=exp(preds$fit)/(1+exp(preds$fit))
se.bands.logit=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se.bands=exp(se.bands.logit)/(1+exp(se.bands.logit))
Plot
plot(age,I(wage>250),xlim=agelims,type="n",tylim=c(0,.2))
## Warning in plot.window(...): "tylim"不是图形参数
## Warning in plot.xy(xy, type, ...): "tylim"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...): "tylim"不是图
## 形参数
## Warning in axis(side = side, at = at, labels = labels, ...): "tylim"不是图
## 形参数
## Warning in box(...): "tylim"不是图形参数
## Warning in title(...): "tylim"不是图形参数
points(jitter(age),I((wage>250)/5),cex=0.5,pch="|",col="darkgrey")
lines(age.grid,pfit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
Use step function to predict wage using age
chunck=cut(age,4)
table(chunck)
## chunck
## (17.9,33.5] (33.5,49] (49,64.5] (64.5,80.1]
## 750 1399 779 72
fit=lm(wage~chunck,data=Wage)
newChuck=cut(age.grid,4)
preds=predict(fit,newdata = list(chunck=newChuck),se=T)
se.bounds=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
Plot
plot(age,wage,xlim=agelims,col="darkgrey")
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bounds,lwd=1,col="blue",lty=3)
Use splines to predict wage using age
library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage)
preds=predict(fit,newdata=list(age=age.grid),se=T)
se.bounds=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
Plot
plot(age,wage,xlim=agelims,col="darkgrey")
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bounds,lwd=1,col="blue",lty=3)
Use nature splines to predict wage using age
fit=lm(wage~ns(age,df=6),data=Wage)
preds=predict(fit,newdata=list(age=age.grid),se=T)
se.bounds=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
Plot
plot(age,wage,xlim=agelims,col="darkgrey")
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bounds,lwd=1,col="blue",lty=3)
Use smoothing splines to predict wage using age
fit=smooth.spline(age,wage,df=16)
fit2=smooth.spline(age,wage,cv=T)
## Warning in smooth.spline(age, wage, cv = T): cross-validation with non-
## unique 'x' values seems doubtful
fit2$df
## [1] 6.794596
preds=predict(fit,newdata=list(age=age.grid),se=T)
Plot
plot(age,wage,xlim=agelims,col="darkgrey")
#lines(age.grid[2:62],preds$y,col="red",lwd=2)
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("DF 16","CV: DF 6.8"),col=c("red","blue"),lty=1,lwd=2,cex=0.8)
Use local regression to predict wage using age
fit=loess(wage~age,span=0.2,data=Wage)
fit2=loess(wage~age,span=0.5,data=Wage)
preds=predict(fit,data.frame(age=age.grid))
preds2=predict(fit2,data.frame(age=age.grid))
Plot
plot(age,wage,xlim=agelims,col="darkgrey")
lines(age.grid,preds,col="red",lwd=2)
lines(age.grid,preds2,col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=0.8)
gam1=lm(wage~ns(year,df=4)+bs(age,df=5)+education,data=Wage)
library(gam)
## Warning: package 'gam' was built under R version 3.2.5
## Loading required package: foreach
## Loaded gam 1.14
gam2=gam(wage~s(year,4)+s(age,5)+education,data=Wage)
gam3=gam(wage~s(year,4)+lo(age,span=0.7)+education,data=Wage)
gam4=gam(I(wage>250)~s(year,4)+lo(age,span=0.7)+education,family=binomial,data=Wage)
par(mfrow=c(1,3))
plot.gam(gam1,se=T,col="blue")
plot.gam(gam2,se=T,col="red")
plot.gam(gam3,se=T,col="green")
plot.gam(gam3,se=T,col="grey")
We can also include interaction using local weighted regression:
gam.i=gam(wage~lo(year,age,span=0.5)+education,data=Wage)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame,
## bf.maxit, : liv too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame,
## bf.maxit, : lv too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame,
## bf.maxit, : liv too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame,
## bf.maxit, : lv too small. (Discovered by lowesd)
library(akima)
plot(gam.i)